##########################################################################
# Description: Descriptive information                                   #
# Author: Ozlem Tuncel                                                   #
# Title: Lecturer and Data Services Specialist                           #
# Affiliation: Georgia State University                                  #
# Department: Research Data Services & Political Science                 #
# Email: otuncelgurlek1@gsu.edu                                          #
# Date: 08/04/2025                                                       #
# R Version: R version 4.4.0 (2024-04-24 ucrt) -- "Puppy Cup"            #
# Computer: Dell Latitude 7450 x64-based PC                              #
# Processor: Intel(R) Core(TM) Ultra 7 165U, 2.10 GHz, 12 Cores          #
# BIOS version/date: Dell Inc. 1.3.0, 4/11/2024                          #
# OS: 22621.3880                                                         #
##########################################################################

# Session setup ----
# Run this script in a fresh R session (e.g. Session > Restart R in RStudio)
# rather than wiping the workspace from inside the script: rm(list = ls())
# only deletes objects in the global environment. Attached packages and
# changed options survive it, and it would destroy the objects of anyone who
# source()s this file.
set.seed(1234) # fixed seed so any random steps are reproducible
getwd()        # the relative data paths below are resolved against this folder
# setwd() # use this to set the working directory

# Load library ----
library(tidyverse)      # version 2.0.0
library(xtable)         # version 1.8-4
library(tm)             # version 0.7-13
library(SnowballC)      # version 0.7.1
library(wordcloud)      # version 2.6
library(RColorBrewer)   # version 1.1-3

# Import data ----
# Both files are read relative to the current working directory.
# my_data is the table analysed in every section below.
my_data <- read_csv(file = "replication_data/elite_interview_data.csv")
# journal_data is loaded here but not referenced in the sections shown below.
journal_data <- read_csv(file = "replication_data/all_journals.csv")

# JOURNAL-LEVEL INFORMATION ----
# Journals represented in the data: list the distinct names, then count them
unique(my_data$journal)
length(unique(my_data$journal)) # number of journals
length(unique(my_data$article_title)) # number of journal articles

# Create a variable for single and multi-author:
# a row without a second author (author2 is NA) is labelled "Single-author"
my_data <- my_data |>
  mutate(
    author_count = ifelse(is.na(author2), "Single-author", "Multi-author")
  )
table(my_data$author_count) # counts per authorship category
round(prop.table(table(my_data$author_count)) * 100, 1) # same, in percent

# Qualitative and mixed-methods evidence (counts, then percentages)
table(my_data$type_of_evidence)
round(prop.table(table(my_data$type_of_evidence)) * 100, 1)

table(my_data$evidence_details)
round(prop.table(table(my_data$evidence_details)) * 100, 1)

# Subfield distribution (counts, then percentages)
table(my_data$subfield1)
round(prop.table(table(my_data$subfield1)) * 100, 1)

# Single-country: percentage of all rows flagged as single-country studies.
# Rows with a missing flag are dropped from the sum but still counted in the
# denominator.
# NOTE(review): assumes single_country is coded 0/1 (or logical) -- confirm.
sum(my_data$single_country, na.rm = TRUE) * 100 / nrow(my_data)

# FORMAT OF ELITE INTERVIEWS ----
# Sample size: distribution summary and mean (missing values excluded)
summary(my_data$sample_size)
mean(my_data$sample_size, na.rm = TRUE)

# Sample size is not mentioned: count and share of all rows
sum(is.na(my_data$sample_size))
sum(is.na(my_data$sample_size)) * 100 / nrow(my_data)

# Rows with a missing sample size. The condition uses the piped column
# (sample_size) instead of reaching back to my_data$sample_size, so it stays
# correct even if a row-changing step is added earlier in the pipeline.
missing_sample_size <- my_data |>
  filter(is.na(sample_size)) |>
  select(year, sample_size, article_title)

# Number of such rows per year
table(missing_sample_size$year)

# Conduct of interviews (counts, then percentages)
table(my_data$conduct)
round(prop.table(table(my_data$conduct)) * 100, 1)

# Interview format: share of missing values, then counts and percentages
# with NA shown as its own category (useNA = "always")
sum(is.na(my_data$interview_method)) * 100 / nrow(my_data)
table(my_data$interview_method, useNA = "always")
round(prop.table(table(my_data$interview_method, useNA = "always")) * 100, 1)

# Interview questions (counts, then percentages)
table(my_data$interview_questions)
round(prop.table(table(my_data$interview_questions)) * 100, 1)

# Recruitment strategies: print the raw column for inspection, then the count
# and share of rows where no strategy is recorded
my_data$recruitment
sum(is.na(my_data$recruitment))
sum(is.na(my_data$recruitment)) * 100 / nrow(my_data)

# BEST PRACTICES IN INTERVIEWS ----
# Appendix information: counts, then percentages of non-missing values
table(my_data$appendix_interviews)
round(100 * prop.table(table(my_data$appendix_interviews)), digits = 1)

# Anonymity information: counts and percentages for the anonymity flag,
# followed by percentages for the anonymity explanation
table(my_data$anonymity)
round(100 * prop.table(table(my_data$anonymity)), digits = 1)
round(100 * prop.table(table(my_data$anonym_explanation)), digits = 1)

# Data access: NA is reported as its own category
table(my_data$data_access, useNA = "always")
round(
  100 * prop.table(table(my_data$data_access, useNA = "always")),
  digits = 1
)

# IRB information: counts, each count as a percentage of ALL rows (missing
# values stay in the denominator), and the number of missing values
table(my_data$IRB)
100 * table(my_data$IRB) / nrow(my_data)
sum(is.na(my_data$IRB))

# Rows that mention IRB, tabulated by year
IRB_mentioned <- filter(my_data, IRB == "Yes")

table(IRB_mentioned$year)

# Numeric anonymity indicator for the correlation tests further down:
# 0 when anonym_explanation is "No", 1 for any other value, NA stays NA
my_data <- my_data |>
  mutate(anonym_binary = ifelse(anonym_explanation == "No", 0, 1))
# IRB status cross-tabulated by year
table(my_data$IRB, my_data$year)

# Elite description: number and share of missing values, then the
# distribution of the recorded values
sum(is.na(my_data$elite_description))
100 * sum(is.na(my_data$elite_description)) / nrow(my_data)
table(my_data$elite_description)

# List of elites: counts, then percentages
table(my_data$list_interviewees)
round(100 * prop.table(table(my_data$list_interviewees)), digits = 1)

# Correlation tests
# Build the numeric indicators used below.
# - IRB_binary: 1 when IRB is "Yes"; every other value, including NA, is 0.
# - consent_binary: 1 when consent_form is "Yes", 0 otherwise; NA stays NA.
my_data <- my_data |>
  mutate(
    IRB_binary = as.numeric(IRB %in% "Yes"),
    consent_binary = ifelse(consent_form == "Yes", 1, 0)
  )

# Correlation of the interview-appendix variable with each of the other
# reporting practices, using 99% confidence intervals.
# NOTE(review): cor.test() only accepts numeric vectors; this assumes
# appendix_interviews, list_interviewees and interview_questions are stored
# as numbers (e.g. 0/1) -- confirm against the data file.
cor.test(
  x = my_data$appendix_interviews, y = my_data$anonym_binary,
  conf.level = 0.99
)
cor.test(
  x = my_data$appendix_interviews, y = my_data$list_interviewees,
  conf.level = 0.99
)
cor.test(
  x = my_data$appendix_interviews, y = my_data$interview_questions,
  conf.level = 0.99
)
cor.test(
  x = my_data$appendix_interviews, y = my_data$IRB_binary,
  conf.level = 0.99
)
cor.test(
  x = my_data$appendix_interviews, y = my_data$consent_binary,
  conf.level = 0.99
)

